\ifx \allfiles \undefined
\documentclass{article}
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{graphicx}

\begin{document}
\title{Method}

\newcommand{\following}{\ensuremath{following}}
\newcommand{\follower}{\ensuremath{follower}}

\maketitle \else \fi

\newcommand{\argmax}{\operatornamewithlimits{argmax}}

\section{Our Approach}\label{sec:method}

\subsection{Snowball Algorithm}
%In considering the operational models for the extraction of a profile keyword of a user through a social network $G$, we will first treat this problem as an information influence spread processing.
Twitter user follows other users based on his own interest, while user's tweets are pushed to his followers. Given a category $\mathcal{C}$, the probability of a user $u$ belonging to $\mathcal{C}$ is determined by the relevance score $s_u$. Assume the probability of $u$ publishing a tweet belonging to $\mathcal{C}$ is also $s_u$, and each user publishes the same number ($k$) of tweets. Then the total number of tweets received by $u$ is $|\following(u)|k$, while $\sum_{v \in \following(u)} s_v k$ of them belong to $\mathcal{C}$. The probability of receiving a tweet in $\mathcal{C}$ by $u$ is given by
$${\sum_{v \in \following(u)} s_v k \over |\following(u)|k} = \sum_{v \in following(u)}{s_v \over |\following(u)|}.$$
Further assume that a user publishes exactly what he receives, then the probability of publishing a tweet in $\mathcal{C}$ by $u$ is
\begin{equation}\label{eq:snowball}
s_u = \sum_{v \in \following(u)} \frac{s_v}{|\following(u)|}.
\end{equation}

%If a user is a UCLA student, he is likely to follow other users in UCLA. Similarly, a user who belongs to category $\mathcal{C}$ is likely to follow some other users in $\mathcal{C}$. Thus, a very simple and intuitive relevance score measure is to calculate the fraction of a user's followings that belong to category $\mathcal{C}$, i.e., calculate $s_u$ according to:

%Given a category $\mathcal{C}$ and a keyword $z_{\mathcal{C}}$ which best describes $\mathcal{C}$, each user $u \in V$ has an active score $s_u$ representing the likelihood that $u$ belongs to category $\mathcal{C}$. If $s_u \geq 1$, user $u$ is \emph{active}, indicating $u$ belongs to $\mathcal{C}$; otherwise, $u$ is \emph{inactive}. The snowball algorithm calculates $s_u$ based on the linear threshold model proposed in \cite{snowball1,snowball2}, which captures the influence propagation process in the network.

%For more information, please view paper Maximizing the spread of influence through a social network, page 2.

%Given keyword $z_{\mathcal{C}}$, users are partitioned into sets $\mathcal{A}$ and $\mathcal{B}$. The users in $\mathcal{A}$ are active, and users in set $\mathcal{B}$ are inactive. We focus on settings guided by previous work about the spread of influence in which each user's state to become active increases monotonically as more of its friends become actives. The snowball algorithm is an iterative algorithm. Once a user become active, it remains active in the future iterations. For an inactive user $u$, more and more followings of $u$ become active after many iterations, and $u$ might become active if enough
%as time passed, more and more of users followed by $u$ become active; at some point, this cause $u$ to become active, and $u$ influenced his followers.

The snowball algorithm iteratively calculates the value of $s_u$ according to Eqn.~(\ref{eq:snowball}). Initially, $s_u$ is set to 1 for $u \in \mathcal{A}$, and $s_u$ is 0 for $u \in \mathcal{B}$. During each iteration, the algorithm scans through each user $u \in \mathcal{B}$, and updates $s_u$ according to Eqn.~(\ref{eq:snowball}) (the $s_v$ value in equation is the old value in the last iteration). The algorithm iterates until the terminate condition is satisfied. Typical terminate condition can be an upper bound on the iteration number, or the ranking of users does not change during two consecutive iterations. The users in $\mathcal{B}$ are ranked based on the final value of $s_u$.

%As shown in Algorithm~\ref{alg:snowball}, the snowball algorithm is an iterative algorithm. Initially, the active score of users in $\mathcal{A}$ is set to 1, while the active score of users in $\mathcal{B}$ is set to 0. So only users in $\mathcal{A}$ are active at the beginning. During each iteration, the algorithm scans through each inactive user $u$ and calculate its new active score based on the active score of $u$'s followings:
%\begin{equation}\label{eq:snowball}
%s_u = \sum_{v \in \following(u)} \frac{s_v}{|\follower(v)|}
%\end{equation}
%If the new active score $s_u$ is no less than 1, then user $u$ becomes active. Although user $u$ is inactive at the beginning, he might become active as many of his following becomes active during the iteration, and he will be able to contribute to his followers once he becomes active. The algorithm iterates until no more users could become active. Users in $\mathcal{B}$ are ranked based on the time at which they become active.
%In order to obtain the ranking result for $z_{\mathcal{C}}$, here we rank the inactive users in $D$ according to their time point to become an active user.

%------------------------------------------------
%\begin{algorithm}[htbp]
%\caption{\textsc{Snowball}}
%\textbf{Input: }{Category $\mathcal{C}$ and keyword $z_{\mathcal{C}}$}\\
%\textbf{Output: }{An array $rank$ containing the users in $\mathcal{B}$ ranked on the probability of belonging to $\mathcal{C}$}
%\begin{algorithmic}[1]
%\STATE $s_u \leftarrow 1$ {\bf for} $u \in \mathcal{A}$, $s_u \leftarrow 0$ {\bf for} $u \in \mathcal{B}$
%\WHILE{$rank$ changed during last iteration}
%\FOR{$u \in V$ s.t. $s_u < 1$}
%    \STATE update $s_u$ according to Eqn.~(\ref{eq:snowball})
%    \STATE {\bf if} $s_u \geq 1$, {\bf then} add $u$ to $rank$
%\ENDFOR
%\ENDWHILE
%\STATE {\bf for} $u \in V$ s.t. $s_u < 1$ {\bf do} add $u$ to $rank$
%\RETURN $rank$
%\end{algorithmic}
%\label{alg:snowball}
%\end{algorithm}
%------------------------------------------------

\subsection{Bidirectional Snowball Algorithm}

%The underlying graph structure of twitter social network is similar to that of the Web graph, where a user in twitter corresponds to a page in the Web, and a following relation between two users corresponds to a directed hyperlink between two pages. Existing methods like PageRank\cite{page1999pagerank}, TrustRank\cite{gyngyi2004combating} and random walks with restarts\cite{tong2006fast} are popular random walk based methods for ranking nodes in the Web graph. Our second approach uses a variant of TrustRank to calculate the relevance score $s_u$ for each user $u$. The idea is user $v$ positively contributes to $u$'s relevance score if $v$ is $u$'s following, and user $w$ also positively contributes to $u$'s relevance score if $w$ is $u$'s follower.

The snowball algorithm considers the tweets propagation from user to his followers. The inverse part of this process is the propagation from a user to his followings. A user in $\mathcal{C}$ tends to follow many users in $\mathcal{C}$, while a user followed by many users in $\mathcal{C}$ tends to belong to $\mathcal{C}$. This intuition leads to the bidirectional snowball algorithm. It calculates two vectors, $P = (p_1, \cdots, p_n)$ and $Q = (q_1, \cdots, q_n)$, where $p_u$ represents the relevance score of user $u$ to category $\mathcal{C}$ with respect to the following direction, and $q_u$ represents the relevance score with respect to the follower direction. The updating process is similar to the snowball algorithm:
\begin{eqnarray}\label{eq:bidsnowball}
p_u & = & \sum_{v \in \following(u)} \frac{p_v}{|\following(u)|} \nonumber \\
q_u & = & \sum_{v \in \follower(u)} \frac{q_v}{|\follower(u)|}
\end{eqnarray}

%Considering the graph structure generated by twitter social network, it is similar to web graph structure where users are pages, and there are some outgoing links from $u$ to $v$ since that $u$ is following $v$, and also incoming links from $v$ to $u$ since that $u$ is followed by $v$.

%In order to predict the relationship between keywords or phrase and users, a second general approach is to predict the relationship importance between users in $D$ and users in $L$. For user $u$, an easy way to rank the importance of other users to him is performing the random walk from $u$. After the random walk starting from user $u$ go through his friends links, we obtain the rank of users in which top ranked users might be most important guy that $u$ follows. By the random walk starting from user $u$ go through his follower links, we obtain the rank of users in which top ranked users might be most important guy that followed $u$. For a specified or phrase $z$, we add undirected links from $z$ to users have this keyword. And then, after the random walk from user $u$, we get how important that $z$ is related to $u$ in both directions.

%Define two vectors, $P = (p_1, \cdots, p_n)$ and $Q = (q_1, \cdots, q_n)$, where $p_u$ represents the relevance score of user $u$ to category $\mathcal{C}$ with respect to the following direction, and $q_u$ represents the relevance score with respect to the follower direction.
%However, we don't have computation power to perform this random walk since that there are too many users in social network. We modify this model in reverse direction as the Trust-rank algorithm. For keyword or phrase $z_i$, we use $a_j$ represent the importance that user $u_j$ is related to $z_i$ in friends direction and $s_j$ represent the importance that $u_j$ is related to $z_i$ in followers direction. Additionally, we use vector $P = (p_1, p_2, \cdots, p_n)$ and $Q = (q_1, q_2, \cdots, q_n)$.
%In the following direction, $u$ receives weight from each of his followings with probability $(1 - d) / |\following(u)|$. On the other hand, $u$ receives weight from each of his followers with probability $(1 - d) / |\follower(u)|$ in the follower direction. The random walk is formulated as:
%\begin{eqnarray}\label{eq:randomwalk}
%P & = & (1 - d) M_P P + d T \nonumber \\
%Q & = & (1 - d) M_Q Q + d T
%\end{eqnarray}
%$M_P$ and $M_Q$ are $n \times n$ transition matrices described above. Vector $T$ represents the trust value of each user, where $T_u = \frac{1}{|\mathcal{A}|}$ for $u \in \mathcal{A}$ and $T_u = 0$ for $u \in \mathcal{B}$. $d$ is the damping factor.

The initial value of $p_u$ and $q_u$ is 1 for $u \in \mathcal{A}$, otherwise it is 0. Each iteration updates the value of $P$ and $Q$ based on Eqn.~(\ref{eq:bidsnowball}). Finally, the users are ranked according to the relevance score $s_u = p_u q_u$, which is a combination of relevance to category $\mathcal{C}$ from both following and follower directions.

\subsection{Naive Bayes Algorithm}

The above two algorithms works well in many situations without using any tweets information. %However, the tweets information are ignored in these two approaches.
Our third approach builds a classifier using the tweets information, and predicts how likely a user in $\mathcal{B}$ belongs to category $\mathcal{C}$. The users in $\mathcal{A}$ are positive training examples, and the users in $\mathcal{B}$ are negative training examples.

%Both two models described above works well in some situations. However, we ignore the tweets information and other profile information in our dataset. With the information extracted from our dataset, another general approach to our problem would be to view it as a classification task. We take users that already have some profile keyword as positive training examples, and treat other users as negative training examples. Then we could learn a classifier that predicts how likely that a user is going to have a profile keyword.

There are many machine learning algorithms which address this classification problem. We use the naive Bayes classifier \cite{maron1961automatic} which is one of the most effective and efficient classification algorithm. The classifier separates the users into two classes, users belong to $\mathcal{C}$ ($c = 1$) and users do not belong to $\mathcal{C}$ ($c = 0$).

For each user $u$, let $T_u$ be the collection of $u$'s tweets, location and biography. The word set for corpus $\bigcup_{u = 1}^n T_u$ is $W = \{w_1, \cdots, w_m\}$.
%the set of tweets published by $u$ be $T_u = \{t_1, t_2, \cdots\}$, where each tweet $t_i \in T_u$ contains a set of words from $W = \{w_1, \cdots, w_m\}$. Then $T_u$ can be viewed as a bag of words from $W$.
$\mathbf{1}_{T_u}(w_i)$ is the indicator function of $T_u$, where $\mathbf{1}_{T_u}(w_i)$ equals to 1 if word $w_i$ appears in $T_u$, otherwise it equals to 0. Let $w_i = 1$ represents the event that word $w_i$ occurs, and $w_i = 0$ represents the event that word $w_i$ does not occur.

For each user $u$, the naive Bayes classifier tries to find $i \in \{0, 1\}$ which maximizes
$$p(c = i | w_1 = \mathbf{1}_{T_u}(w_1), \cdots, w_m = \mathbf{1}_{T_u}(w_m)),$$
or equivalently maximizes
$$p(c = i) \prod_{j=1}^m p(w_j = \mathbf{1}_{T_u}(w_j) | c = i).$$
As the problem has only two classes, it is equivalent to determining the sign for $s_u$:
\begin{eqnarray}
s_u & = & \log(p(c = 1)) - \log(p(c = 0)) \nonumber \\
    & + & \sum_{j=1}^m (p(w_j = \mathbf{1}_{T_u}(w_j)|c = 1) - p(w_j = \mathbf{1}_{T_u}(w_j)|c = 0)). \nonumber
\end{eqnarray}
Standard naive Bayes classifier determines whether $u$ belongs to $\mathcal{C}$ based on $s_u$ is positive or negative. Instead of using the sign of $s_u$, we use the value of $s_u$ to determine how likely $u$ belongs to $\mathcal{C}$. The larger the value (positive value) of $s_u$, the more likely $u$ belongs to $\mathcal{C}$; the smaller the value (negative value), the more likely $u$ does not belong to $\mathcal{C}$. Finally, the users in $\mathcal{B}$ are ranked based on the value of $s_u$.

The parameters of the classifier is determined by counting. The value of $p(c = 1)$ is calculated as the fraction of positive training examples, and the value of $p(w_j = \mathbf{1}_{T_u}(w_j)|c = 1)$ is calculated as the number of positive examples such that $w_j$ appears (does not appear) in $T_u$ divided by the number of positive examples if $\mathbf{1}_{T_u}(w_j) = 1$ (if $\mathbf{1}_{T_u}(w_j) = 0$). The calculation of $p(c = 0)$ and $p(w_j = \mathbf{1}_{T_u}(w_j)|c = 0)$ are similar.
%In this algorithm, the learner attempts to construct a classifier for a set of labeled training examples.
%There are several machine learning algorithms that address this problem. Here we use a Naive Bayes model which is one of the most effective and efficient classification algorithm. In this algorithm, the learner attempts to construct a classifier for a set of training example with class labels.

%Assume that $X_1, X_2, \cdots X_n$ are training vectors $X_i=(x_1, x_2, \cdots, x_m)$ and each training vector has a class label $c$. The Bayes estimate $p(x_j|c)$ from the training examples to maximize the probability:
%$$\prod_i p(c)\prod_jp(x_j|c)$$

%In our paper, we apply this Naive Bayes method to learn the probability that user $u_j$ has the profile $z_i$. And return the sorted list according the probability.

%The first is that the class is imbalance, for each profile $z_i$, there will be a very small fraction of the users in the network and learning is particularly hard in domains with high class imbalance. The second challenge is that there are huge amount of features. There are lots different words appeared in users tweets and also typos and connected words. Therefore, we modify the Naive Bayes algorithm so that for each profile keyword $z_i$, the learner could give user relevant score according to:
%We modify the Naive Bayes algorithm to calculate the relevant score according to the following equation:
%\begin{equation}
%r = \frac{1}{n_i}\sum_{i=1}^{n_i} \log p(c|w_i)
%\end{equation}

\subsection{Co-training Algorithm}

%The random walk model works well for profile which users with this profile like connected with each other. For example, users in same companies or universities would like to follow each other. The Naive Bayes model works well for profile which users with this profile like to talk about same topics. For example, users like comic probably didn't know each other and follow each, but they tweet same words for same topic.

%There is a general approach to combine two different models together and generate better result for different kind of profile keyword.
The bidirectional snowball algorithm and naive Bayes algorithm focus on different perspective of twitter networks: the former focus on the network structure level, and the later focus on the tweets information level. Both of them provide reasonably well rankings for the training data, while the perspectives of these two algorithms are conditionally independent given the ranking. A more powerful approach called co-training \cite{cotraining} combines the two algorithms together, and iteratively reinforce the result of one algorithm by the result of the other algorithm.

%Because our two models (i) based on different prospective of user behaviors; (ii) each training method is sufficient to train a good ranker when we have enough training data; (iii) two prospective of user behaviors are conditionally independent given the class. So we use co-training \cite{cotraining} method to combine two approaches.

Co-training algorithm is described in Algorithm~\ref{alg:cotrain}. Users are partitioned into two disjoint sets $\mathcal{A}$ and $\mathcal{B}$ described in Section~\ref{sec:problem}. $l$ and $k$ are positive integer parameters, and $k > l$. During each iteration, top $l$ users generated by the naive Bayes algorithm and top $l$ users generated by bidirectional snowball algorithm are merged into set $\mathcal{A}$. The merged users are considered as positive training examples during later training process. The iteration is executed until the top $k$ users generated by the bidirectional snowball algorithm and naive Bayes algorithm are the same. The result of the last execution of the naive Bayes algorithm is used as the final result of the co-training algorithm.

\begin{algorithm}[htbp]
\caption{\textsc{Co-training}}
\textbf{Input: }{Category $\mathcal{C}$, two disjoint sets $\mathcal{A}$ and $\mathcal{B}$, parameter $k$ and $l$}\\
\textbf{Output: }{An array $rank$ containing users in $\mathcal{B}$ ranked on the probability of belonging to $\mathcal{C}$}
\begin{algorithmic}[1]
\REPEAT
\STATE $rank^{\prime}$ $\leftarrow$ bidirectional snowball algorithm($\mathcal{A}$, $\mathcal{B}$)
\STATE $rank$ $\leftarrow$ naive Bayes algorithm($\mathcal{A}$, $\mathcal{B}$)
\STATE $\mathcal{A} \leftarrow \mathcal{A} ~ + $ \{top $l$ users in $rank^{\prime}$\}
\STATE $\mathcal{A} \leftarrow \mathcal{A} ~ + $ \{top $l$ users in $rank$\}
\UNTIL{Top $k$ users in $rank^{\prime}$ and $rank$ are the same}
\RETURN $rank$
\end{algorithmic}
\label{alg:cotrain}
\end{algorithm}

\ifx \allfiles \undefined
\end{document}
\fi
